Constructing Hierarchical Edge Bundling charts for molecular data.
Molecular analytes
Female data
Load required libraries, data and create the dendrogram
This code loads necessary R packages, loads the female data, standardizes the data to one unit variance across sampled groups, calculates euclidean distances, performs hierarchical clustering and creates a dendrogram.
Code
# Load required librarieslibrary(readxl)library(dplyr)library(stats)library(ggplot2)library(ggdendro)library(igraph)library(ggraph)library(tidygraph)# Step 1: Load and standardize the datadata <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =1)data <-as.data.frame(data)# Ensure the first column becomes the row namesrownames(data) <- data[,1]data <- data[,-1]#Standardise the data scaled_data <- vegan::decostand(data, method="standardize", MARGIN =1)# Step 2: Calculate Euclidean distancesdist_matrix <-dist(scaled_data, method ="euclidean")# Step 3: Apply hierarchical clusteringhc <-hclust(dist_matrix, method ="ward.D2")# Step 4 : Convert to dendrogram dendro <-as.dendrogram(hc)
Create dataframes
This code creates three dataframes necessary for plotting the charts: edges, vertices and connections
Code
#Step 5: Use ggraph hierarchy_graph =as_tbl_graph(dendro)#Hierarchal dataframe edge_df <- hierarchy_graph %>%activate(edges) %>%as_tibble() %>%rename(from = from, to = to)# Extract vertices dataframevertex_df <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(node_id =row_number())vertices_my <- vertex_df %>%mutate(id =row_number(),name = label,shortName = label, # Assuming you don't have a shorter namegroup =ifelse(leaf, "leaf", "internal")) %>%select(id, name, shortName, group, leaf, height, members) %>%arrange(name) %>%mutate(name =factor(name, levels =unique(name)))# Check for NA or empty names and replace them with a placeholdervertices_my <- vertices_my %>%mutate(shortName =ifelse(is.na(shortName) | shortName =="", paste0("Node", row_number()), shortName)) %>%mutate(name =ifelse(is.na(name) | name =="", paste0("Node_", row_number()), name))# Create connections_my dataframeconnections_my_df <- edge_df %>%left_join(vertex_df, by =c("from"="node_id")) %>%left_join(vertex_df, by =c("to"="node_id"), suffix =c("_from", "_to"))connections_my <- connections_my_df %>%select(from, to) %>%#This selects only the 'from' and 'to' columns from the original dataframe.mutate(from = vertices_my$name[from],to = vertices_my$name[to]) %>%#This replaces the values in the 'from' and 'to' columns with corresponding names from the vertices_my dataframe.distinct() #This removes any duplicate rows from the resulting dataframe.connections_my <- connections_my %>%filter(from !=""& to !="") #This keeps only the rows where from and to are not empty strings # Preparation to draw labels properly:# Extract vertex datavertices_my <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(id =row_number(),name = label,shortName = label, # Keep original labelsgroup =ifelse(leaf, "leaf", "internal"),unique_name =make.unique(as.character(name), sep ="_") ) %>%select(id, name, shortName, unique_name, group, leaf, height, members)# Correct any empty or NA names only for internal nodesvertices_my <- vertices_my %>%mutate(name =ifelse(!leaf & (is.na(name) | name ==""), paste0("Node_", row_number()), name),shortName =ifelse(!leaf & (is.na(shortName) | shortName ==""), paste0("Node", row_number()), shortName) )# Prepare label angles for leavesleaf_vertices <- vertices_my %>%filter(leaf) %>%mutate(leaf_id =row_number(),angle =90-360* (leaf_id -1) /n(),hjust =ifelse(angle <-90, 1, 0),angle =ifelse(angle <-90, angle +180, angle) )# Update vertices_my with leaf anglesvertices_my <- vertices_my %>%left_join(leaf_vertices %>%select(id, angle, hjust), by ="id")
The circular dendrogram
This code plots a circular dendrogram from the hierarchical clustering results
The NbClust function is being used to determine the optimal number of clusters in our data. It applies multiple clustering validity indices and methods to suggest the best number of clusters, providing a consensus view based on various criteria to help make a more robust decision about the appropriate cluster count for our dataset.
Code
library(stringr)library(NbClust)methods <-c("ward.D2")for (m in methods) {# Capture all output output <-capture.output({ result <-NbClust(data = scaled_data, distance ="euclidean", min.nc =2, max.nc =10, method = m, index ="all") })# Join the output into a single string output_text <-paste(output, collapse ="\n")# Split the output by the asterisk separator split_output <-str_split(output_text, "\\*{10,}")[[1]]# Check if we have at least 3 partsif (length(split_output) >=3) {# Trim whitespace from the second part second_part <-str_trim(split_output[2])# Split the second part into lines lines <-str_split(second_part, "\n")[[1]]# Format the output formatted_output <-character(0)for (line in lines) {if (str_detect(line, "Conclusion")) { formatted_output <-c(formatted_output, "", "Conclusion", "") } elseif (str_detect(line, "^\\*")) { formatted_output <-c(formatted_output, line) } }# Join the formatted lines formatted_output <-paste(formatted_output, collapse ="\n")# Print the resultscat(paste("Results for method:", m, "\n\n"))cat(formatted_output)cat("\n\n") # Add some space between results for different methods } else {cat(paste("Unexpected output format for method:", m, "\n")) }}
Results for method: ward.D2
Among all indices:
3 proposed 2 as the best number of clusters
11 proposed 3 as the best number of clusters
2 proposed 5 as the best number of clusters
7 proposed 10 as the best number of clusters
Conclusion
According to the majority rule, the best number of clusters is 3
Plot the clusters
Code
cluster_groups <-cutree(hc, k =3)# Add cluster information to vertices_myvertices_my <- vertices_my %>%mutate(cluster =factor(cluster_groups[match(name, names(cluster_groups))]))library(RColorBrewer)# Select three colors from the "Paired" palettecluster_colors <-brewer.pal(3, "Paired")#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh2 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="RdPu") +geom_node_point(aes(filter = leaf, color = cluster), # Move points slightly outwardsize =2, alpha =1) +geom_node_text(aes(x = x *1.05, y = y *1.05, # Move text even further outwardfilter = leaf, label = shortName, angle = angle, hjust = hjust), size =2, alpha =1) +scale_color_manual(values = cluster_colors) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0), "cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4)) # Further expanded limits to accommodate text and pointsh2
This code loads the male data, standardizes the data to one unit variance across sampled groups, calculates euclidean distances, performs hierarchical clustering and creates a dendrogram.
Code
# Step 1: Load and standardize the datadata <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =2)data <-as.data.frame(data)# Ensure the first column becomes the row namesrownames(data) <- data[,1]data <- data[,-1]#Standardise the data scaled_data <- vegan::decostand(data, method="standardize", MARGIN =1)# Step 2: Calculate Euclidean distancesdist_matrix <-dist(scaled_data, method ="euclidean")# Step 3: Apply hierarchical clusteringhc <-hclust(dist_matrix, method ="ward.D2")# Step 4 : Convert to dendrogram dendro <-as.dendrogram(hc)
Create dataframes
This code creates three dataframes necessary for plotting the charts: edges, vertices and connections
Code
#Step 5: Use ggraph hierarchy_graph =as_tbl_graph(dendro)#Hierarchal dataframe edge_df <- hierarchy_graph %>%activate(edges) %>%as_tibble() %>%rename(from = from, to = to)# Extract vertices dataframevertex_df <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(node_id =row_number())vertices_my <- vertex_df %>%mutate(id =row_number(),name = label,shortName = label, # Assuming you don't have a shorter namegroup =ifelse(leaf, "leaf", "internal")) %>%select(id, name, shortName, group, leaf, height, members) %>%arrange(name) %>%mutate(name =factor(name, levels =unique(name)))# Check for NA or empty names and replace them with a placeholdervertices_my <- vertices_my %>%mutate(shortName =ifelse(is.na(shortName) | shortName =="", paste0("Node", row_number()), shortName)) %>%mutate(name =ifelse(is.na(name) | name =="", paste0("Node_", row_number()), name))# Create connections_my dataframeconnections_my_df <- edge_df %>%left_join(vertex_df, by =c("from"="node_id")) %>%left_join(vertex_df, by =c("to"="node_id"), suffix =c("_from", "_to"))connections_my <- connections_my_df %>%select(from, to) %>%#This selects only the 'from' and 'to' columns from the original dataframe.mutate(from = vertices_my$name[from],to = vertices_my$name[to]) %>%#This replaces the values in the 'from' and 'to' columns with corresponding names from the vertices_my dataframe.distinct() #This removes any duplicate rows from the resulting dataframe.connections_my <- connections_my %>%filter(from !=""& to !="") #This keeps only the rows where from and to are not empty strings # Preparation to draw labels properly:# Extract vertex datavertices_my <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(id =row_number(),name = label,shortName = label, # Keep original labelsgroup =ifelse(leaf, "leaf", "internal"),unique_name =make.unique(as.character(name), sep ="_") ) %>%select(id, name, shortName, unique_name, group, leaf, height, members)# Correct any empty or NA names only for internal nodesvertices_my <- vertices_my %>%mutate(name =ifelse(!leaf & (is.na(name) | name ==""), paste0("Node_", row_number()), name),shortName =ifelse(!leaf & (is.na(shortName) | shortName ==""), paste0("Node", row_number()), shortName) )# Prepare label angles for leavesleaf_vertices <- vertices_my %>%filter(leaf) %>%mutate(leaf_id =row_number(),angle =90-360* (leaf_id -1) /n(),hjust =ifelse(angle <-90, 1, 0),angle =ifelse(angle <-90, angle +180, angle) )# Update vertices_my with leaf anglesvertices_my <- vertices_my %>%left_join(leaf_vertices %>%select(id, angle, hjust), by ="id")
The circular dendrogram
This code plots a circular dendrogram from the hierarchical clustering results
The NbClust function is being used to determine the optimal number of clusters in our data. It applies multiple clustering validity indices and methods to suggest the best number of clusters, providing a consensus view based on various criteria to help make a more robust decision about the appropriate cluster count for our dataset.
Code
library(stringr)library(NbClust)methods <-c("ward.D2")for (m in methods) {# Capture all output output <-capture.output({ result <-NbClust(data = scaled_data, distance ="euclidean", min.nc =2, max.nc =10, method = m, index ="all") })# Join the output into a single string output_text <-paste(output, collapse ="\n")# Split the output by the asterisk separator split_output <-str_split(output_text, "\\*{10,}")[[1]]# Check if we have at least 3 partsif (length(split_output) >=3) {# Trim whitespace from the second part second_part <-str_trim(split_output[2])# Split the second part into lines lines <-str_split(second_part, "\n")[[1]]# Format the output formatted_output <-character(0)for (line in lines) {if (str_detect(line, "Conclusion")) { formatted_output <-c(formatted_output, "", "Conclusion", "") } elseif (str_detect(line, "^\\*")) { formatted_output <-c(formatted_output, line) } }# Join the formatted lines formatted_output <-paste(formatted_output, collapse ="\n")# Print the resultscat(paste("Results for method:", m, "\n\n"))cat(formatted_output)cat("\n\n") # Add some space between results for different methods } else {cat(paste("Unexpected output format for method:", m, "\n")) }}
Results for method: ward.D2
Among all indices:
8 proposed 2 as the best number of clusters
6 proposed 3 as the best number of clusters
1 proposed 4 as the best number of clusters
2 proposed 5 as the best number of clusters
7 proposed 10 as the best number of clusters
Conclusion
According to the majority rule, the best number of clusters is 2
Plot the clusters
Code
cluster_groups <-cutree(hc, k =2)# Add cluster information to vertices_myvertices_my <- vertices_my %>%mutate(cluster =factor(cluster_groups[match(name, names(cluster_groups))]))library(RColorBrewer)# Select three colors from the "Paired" palettecluster_colors <-brewer.pal(2, "Paired")#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh4 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="BuPu") +geom_node_point(aes(filter = leaf, color = cluster, x = x, y = y), # Move points slightly outwardsize =2, alpha =1) +geom_node_text(aes(x = x *1.05, y = y *1.05, # Move text even further outwardfilter = leaf, label = shortName, angle = angle, hjust = hjust), size =2.2, alpha =1) +scale_color_manual(values = cluster_colors) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0), "cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4)) # Further expanded limits to accommodate text and pointsh4
Molecular analytes and cardiometabolic health markers
Female data
Load data and create the dendrogram
This code loads the female data, standardizes the data to one unit variance across sampled groups, calculates euclidean distances, performs hierarchical clustering and creates a dendrogram.
Code
# Step 1: Load and standardize the datadata <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =3)data <-as.data.frame(data)# Ensure the first column becomes the row namesrownames(data) <- data[,1]data <- data[,-1]#Standardise the data scaled_data <- vegan::decostand(data, method="standardize", MARGIN =1)# Step 2: Calculate Euclidean distancesdist_matrix <-dist(scaled_data, method ="euclidean")# Step 3: Apply hierarchical clusteringhc <-hclust(dist_matrix, method ="ward.D2")# Step 4 : Convert to dendrogram dendro <-as.dendrogram(hc)
Create dataframes
This code creates three dataframes necessary for plotting the charts: edges, vertices and connections
Code
#Step 5: Use ggraph hierarchy_graph =as_tbl_graph(dendro)#Hierarchal dataframe edge_df <- hierarchy_graph %>%activate(edges) %>%as_tibble() %>%rename(from = from, to = to)# Extract vertices dataframevertex_df <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(node_id =row_number())vertices_my <- vertex_df %>%mutate(id =row_number(),name = label,shortName = label, # Assuming you don't have a shorter namegroup =ifelse(leaf, "leaf", "internal")) %>%select(id, name, shortName, group, leaf, height, members) %>%arrange(name) %>%mutate(name =factor(name, levels =unique(name)))# Check for NA or empty names and replace them with a placeholdervertices_my <- vertices_my %>%mutate(shortName =ifelse(is.na(shortName) | shortName =="", paste0("Node", row_number()), shortName)) %>%mutate(name =ifelse(is.na(name) | name =="", paste0("Node_", row_number()), name))# Create connections_my dataframeconnections_my_df <- edge_df %>%left_join(vertex_df, by =c("from"="node_id")) %>%left_join(vertex_df, by =c("to"="node_id"), suffix =c("_from", "_to"))connections_my <- connections_my_df %>%select(from, to) %>%#This selects only the 'from' and 'to' columns from the original dataframe.mutate(from = vertices_my$name[from],to = vertices_my$name[to]) %>%#This replaces the values in the 'from' and 'to' columns with corresponding names from the vertices_my dataframe.distinct() #This removes any duplicate rows from the resulting dataframe.connections_my <- connections_my %>%filter(from !=""& to !="") #This keeps only the rows where from and to are not empty strings # Preparation to draw labels properly:# Extract vertex datavertices_my <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(id =row_number(),name = label,shortName = label, # Keep original labelsgroup =ifelse(leaf, "leaf", "internal"),unique_name =make.unique(as.character(name), sep ="_") ) %>%select(id, name, shortName, unique_name, group, leaf, height, members)# Correct any empty or NA names only for internal nodesvertices_my <- vertices_my %>%mutate(name =ifelse(!leaf & (is.na(name) | name ==""), paste0("Node_", row_number()), name),shortName =ifelse(!leaf & (is.na(shortName) | shortName ==""), paste0("Node", row_number()), shortName) )# Prepare label angles for leavesleaf_vertices <- vertices_my %>%filter(leaf) %>%mutate(leaf_id =row_number(),angle =90-360* (leaf_id -1) /n(),hjust =ifelse(angle <-90, 1, 0),angle =ifelse(angle <-90, angle +180, angle) )# Update vertices_my with leaf anglesvertices_my <- vertices_my %>%left_join(leaf_vertices %>%select(id, angle, hjust), by ="id")
The circular dendrogram
This code plots a circular dendrogram from the hierarchical clustering results
This code plots a hierarchical edge bundling chart from the hierarchy defined in the dendrogram highlighting which markers are molecular analytes and which are cardiometabolic health markers
Code
# Step 1: Load and standardize the datamarkers <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =5)markers_females=markers[,c(2:3)]library(dplyr)vertices_my <- vertices_my %>%left_join(markers_females, by =c("name"="Females")) %>%rename(Markers = Type) %>%mutate(Markers =factor(Markers))# Define colors for the two groupsmarker_levels <-levels(vertices_my$Markers)group_colors <-setNames(c("black", "#D2B48C"), marker_levels)mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh6 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour =after_stat(index)),show.legend =FALSE) +# Hide edge color legendscale_edge_colour_distiller(palette ="RdPu") +geom_node_point(aes(filter = leaf, color = Markers # Use 'group' for coloring ),size =2, alpha =1) +geom_node_text(aes(x = x *1.05, y = y *1.05,filter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.8, alpha =1) +scale_color_manual(values = group_colors, name =NULL) +# Name the legendcoord_fixed() +theme_void() +theme(legend.position ="bottom",legend.text =element_text(size =6),plot.margin =unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4))h6
The NbClust function is being used to determine the optimal number of clusters in our data. It applies multiple clustering validity indices and methods to suggest the best number of clusters, providing a consensus view based on various criteria to help make a more robust decision about the appropriate cluster count for our dataset.
Code
library(stringr)library(NbClust)methods <-c("ward.D2")for (m in methods) {# Capture all output output <-capture.output({ result <-NbClust(data = scaled_data, distance ="euclidean", min.nc =2, max.nc =10, method = m, index ="all") })# Join the output into a single string output_text <-paste(output, collapse ="\n")# Split the output by the asterisk separator split_output <-str_split(output_text, "\\*{10,}")[[1]]# Check if we have at least 3 partsif (length(split_output) >=3) {# Trim whitespace from the second part second_part <-str_trim(split_output[2])# Split the second part into lines lines <-str_split(second_part, "\n")[[1]]# Format the output formatted_output <-character(0)for (line in lines) {if (str_detect(line, "Conclusion")) { formatted_output <-c(formatted_output, "", "Conclusion", "") } elseif (str_detect(line, "^\\*")) { formatted_output <-c(formatted_output, line) } }# Join the formatted lines formatted_output <-paste(formatted_output, collapse ="\n")# Print the resultscat(paste("Results for method:", m, "\n\n"))cat(formatted_output)cat("\n\n") # Add some space between results for different methods } else {cat(paste("Unexpected output format for method:", m, "\n")) }}
Results for method: ward.D2
Among all indices:
4 proposed 2 as the best number of clusters
8 proposed 3 as the best number of clusters
1 proposed 4 as the best number of clusters
2 proposed 5 as the best number of clusters
1 proposed 8 as the best number of clusters
1 proposed 9 as the best number of clusters
6 proposed 10 as the best number of clusters
Conclusion
According to the majority rule, the best number of clusters is 3
Plot the clusters
Code
cluster_groups <-cutree(hc, k =3)# Add cluster information to vertices_myvertices_my <- vertices_my %>%mutate(cluster =factor(cluster_groups[match(name, names(cluster_groups))]))library(RColorBrewer)# Select three colors from the "Paired" palettecluster_colors <-brewer.pal(3, "Paired")#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh5b =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="RdPu") +geom_node_point(aes(filter = leaf, color = cluster, x = x *1.07, y = y *1.07), # Move points slightly outwardsize =3, alpha =0.6) +geom_node_text(aes(x = x *1.14, y = y *1.14, # Move text even further outwardfilter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.5, alpha =1) +scale_color_manual(values = cluster_colors) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0), "cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4)) # Further expanded limits to accommodate text and pointsh5b
This code loads the male data, standardizes the data to one unit variance across sampled groups, calculates euclidean distances, performs hierarchical clustering and creates a dendrogram.
Code
# Step 1: Load and standardize the datadata <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =4)data <-as.data.frame(data)# Ensure the first column becomes the row namesrownames(data) <- data[,1]data <- data[,-1]#Standardise the data scaled_data <- vegan::decostand(data, method="standardize", MARGIN =1)# Step 2: Calculate Euclidean distancesdist_matrix <-dist(scaled_data, method ="euclidean")# Step 3: Apply hierarchical clusteringhc <-hclust(dist_matrix, method ="ward.D2")# Step 4 : Convert to dendrogram dendro <-as.dendrogram(hc)
Create dataframes
This code creates three dataframes necessary for plotting the charts: edges, vertices and connections
Code
#Step 5: Use ggraph hierarchy_graph =as_tbl_graph(dendro)#Hierarchal dataframe edge_df <- hierarchy_graph %>%activate(edges) %>%as_tibble() %>%rename(from = from, to = to)# Extract vertices dataframevertex_df <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(node_id =row_number())vertices_my <- vertex_df %>%mutate(id =row_number(),name = label,shortName = label, # Assuming you don't have a shorter namegroup =ifelse(leaf, "leaf", "internal")) %>%select(id, name, shortName, group, leaf, height, members) %>%arrange(name) %>%mutate(name =factor(name, levels =unique(name)))# Check for NA or empty names and replace them with a placeholdervertices_my <- vertices_my %>%mutate(shortName =ifelse(is.na(shortName) | shortName =="", paste0("Node", row_number()), shortName)) %>%mutate(name =ifelse(is.na(name) | name =="", paste0("Node_", row_number()), name))# Create connections_my dataframeconnections_my_df <- edge_df %>%left_join(vertex_df, by =c("from"="node_id")) %>%left_join(vertex_df, by =c("to"="node_id"), suffix =c("_from", "_to"))connections_my <- connections_my_df %>%select(from, to) %>%#This selects only the 'from' and 'to' columns from the original dataframe.mutate(from = vertices_my$name[from],to = vertices_my$name[to]) %>%#This replaces the values in the 'from' and 'to' columns with corresponding names from the vertices_my dataframe.distinct() #This removes any duplicate rows from the resulting dataframe.connections_my <- connections_my %>%filter(from !=""& to !="") #This keeps only the rows where from and to are not empty strings # Preparation to draw labels properly:# Extract vertex datavertices_my <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(id =row_number(),name = label,shortName = label, # Keep original labelsgroup =ifelse(leaf, "leaf", "internal"),unique_name =make.unique(as.character(name), sep ="_") ) %>%select(id, name, shortName, unique_name, group, leaf, height, members)# Correct any empty or NA names only for internal nodesvertices_my <- vertices_my %>%mutate(name =ifelse(!leaf & (is.na(name) | name ==""), paste0("Node_", row_number()), name),shortName =ifelse(!leaf & (is.na(shortName) | shortName ==""), paste0("Node", row_number()), shortName) )# Prepare label angles for leavesleaf_vertices <- vertices_my %>%filter(leaf) %>%mutate(leaf_id =row_number(),angle =90-360* (leaf_id -1) /n(),hjust =ifelse(angle <-90, 1, 0),angle =ifelse(angle <-90, angle +180, angle) )# Update vertices_my with leaf anglesvertices_my <- vertices_my %>%left_join(leaf_vertices %>%select(id, angle, hjust), by ="id")
The circular dendrogram
This code plots a circular dendrogram from the hierarchical clustering results
This code plots a hierarchical edge bundling chart from the hierarchy defined in the dendrogram highlighting which markers are molecular analytes and which are cardiometabolic health markers
Code
# Step 1: Load and standardize the datamarkers <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =5)markers_males=markers[,c(1:3)]library(dplyr)vertices_my <- vertices_my %>%left_join(markers_males, by =c("name"="Males")) %>%rename(Markers = Type) %>%mutate(Markers =factor(Markers))# Define colors for the two groupsmarker_levels <-levels(vertices_my$Markers)group_colors <-setNames(c("black", "#D2B48C"), marker_levels)mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh8 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour =after_stat(index)),show.legend =FALSE) +# Hide edge color legendscale_edge_colour_distiller(palette ="BuPu") +geom_node_point(aes(filter = leaf, color = Markers # Use 'group' for coloring ),size =2, alpha =1) +geom_node_text(aes(x = x *1.05, y = y *1.05,filter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.8, alpha =1) +scale_color_manual(values = group_colors, name =NULL) +# Name the legendcoord_fixed() +theme_void() +theme(legend.position ="bottom",legend.text =element_text(size =6),plot.margin =unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4))h8
The NbClust function is being used to determine the optimal number of clusters in our data. It applies multiple clustering validity indices and methods to suggest the best number of clusters, providing a consensus view based on various criteria to help make a more robust decision about the appropriate cluster count for our dataset.
# Extract working indices and their resultsworking_results <- results %>%keep(~is.null(.$error)) %>%map(~ .$result)# Combine resultsif (length(working_results) >0) { combined_result <-list(All.index =do.call(c, map(working_results, ~ .$All.index)),Best.nc =do.call(rbind, map(working_results, ~ .$Best.nc)),Best.partition = working_results[[1]]$Best.partition )# Count the proposed best number of clusters best_nc_counts <-table(combined_result$Best.nc[, 1])# Print results in the requested formatcat("Results for method: ward.D2\n\n")cat("Among all indices:\n\n")for (nc insort(as.numeric(names(best_nc_counts)))) { count <- best_nc_counts[as.character(nc)]cat(sprintf("* %d proposed %d as the best number of clusters\n", count, nc)) }cat("\nConclusion\n\n") best_nc <-as.numeric(names(which.max(best_nc_counts)))cat(sprintf("According to the majority rule, the best number of clusters is %d\n", best_nc))} else {cat("No indices worked successfully.\n")}
Results for method: ward.D2
Among all indices:
6 proposed 2 as the best number of clusters
3 proposed 3 as the best number of clusters
1 proposed 4 as the best number of clusters
2 proposed 5 as the best number of clusters
1 proposed 7 as the best number of clusters
7 proposed 10 as the best number of clusters
Conclusion
According to the majority rule, the best number of clusters is 10
Plot the clusters
Code
cluster_groups <-cutree(hc, k =10)# Add cluster information to vertices_myvertices_my <- vertices_my %>%mutate(cluster =factor(cluster_groups[match(name, names(cluster_groups))]))library(RColorBrewer)# Select three colors from the "Paired" palettecluster_colors <-brewer.pal(10, "Paired")#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh9 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="BuPu") +geom_node_point(aes(filter = leaf, color = cluster, x = x *1.07, y = y *1.07), # Move points slightly outwardsize =3, alpha =0.6) +geom_node_text(aes(x = x *1.14, y = y *1.14, # Move text even further outwardfilter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.6, alpha =1) +scale_color_manual(values = cluster_colors) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0), "cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4)) # Further expanded limits to accommodate text and pointsh9
---title: "Hierarchical Edge Bundling"author: "Théophile L. Mouton"date: "2024-07-22"format: htmleditor: visualcode-fold: truecode-tools: truetoc: truetoc-location: rightcss: custom.cssself-contained: trueexecute: warning: false results: hide---Constructing Hierarchical Edge Bundling charts for molecular data.# Molecular analytes## Female data### Load required libraries, data and create the dendrogramThis code loads necessary R packages, loads the female data, standardizes the data to one unit variance across sampled groups, calculates euclidean distances, performs hierarchical clustering and creates a dendrogram.```{r}# Load required librarieslibrary(readxl)library(dplyr)library(stats)library(ggplot2)library(ggdendro)library(igraph)library(ggraph)library(tidygraph)# Step 1: Load and standardize the datadata <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =1)data <-as.data.frame(data)# Ensure the first column becomes the row namesrownames(data) <- data[,1]data <- data[,-1]#Standardise the data scaled_data <- vegan::decostand(data, method="standardize", MARGIN =1)# Step 2: Calculate Euclidean distancesdist_matrix <-dist(scaled_data, method ="euclidean")# Step 3: Apply hierarchical clusteringhc <-hclust(dist_matrix, method ="ward.D2")# Step 4 : Convert to dendrogram dendro <-as.dendrogram(hc)```### Create dataframesThis code creates three dataframes necessary for plotting the charts: edges, vertices and connections```{r}#Step 5: Use ggraph hierarchy_graph =as_tbl_graph(dendro)#Hierarchal dataframe edge_df <- hierarchy_graph %>%activate(edges) %>%as_tibble() %>%rename(from = from, to = to)# Extract vertices dataframevertex_df <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(node_id =row_number())vertices_my <- vertex_df %>%mutate(id =row_number(),name = label,shortName = label, # Assuming you don't have a shorter namegroup =ifelse(leaf, "leaf", "internal")) %>%select(id, name, shortName, group, leaf, height, members) %>%arrange(name) %>%mutate(name =factor(name, levels =unique(name)))# Check for NA or empty names and replace them with a placeholdervertices_my <- vertices_my %>%mutate(shortName =ifelse(is.na(shortName) | shortName =="", paste0("Node", row_number()), shortName)) %>%mutate(name =ifelse(is.na(name) | name =="", paste0("Node_", row_number()), name))# Create connections_my dataframeconnections_my_df <- edge_df %>%left_join(vertex_df, by =c("from"="node_id")) %>%left_join(vertex_df, by =c("to"="node_id"), suffix =c("_from", "_to"))connections_my <- connections_my_df %>%select(from, to) %>%#This selects only the 'from' and 'to' columns from the original dataframe.mutate(from = vertices_my$name[from],to = vertices_my$name[to]) %>%#This replaces the values in the 'from' and 'to' columns with corresponding names from the vertices_my dataframe.distinct() #This removes any duplicate rows from the resulting dataframe.connections_my <- connections_my %>%filter(from !=""& to !="") #This keeps only the rows where from and to are not empty strings # Preparation to draw labels properly:# Extract vertex datavertices_my <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(id =row_number(),name = label,shortName = label, # Keep original labelsgroup =ifelse(leaf, "leaf", "internal"),unique_name =make.unique(as.character(name), sep ="_") ) %>%select(id, name, shortName, unique_name, group, leaf, height, members)# Correct any empty or NA names only for internal nodesvertices_my <- vertices_my %>%mutate(name =ifelse(!leaf & (is.na(name) | name ==""), paste0("Node_", row_number()), name),shortName =ifelse(!leaf & (is.na(shortName) | shortName ==""), paste0("Node", row_number()), shortName) )# Prepare label angles for leavesleaf_vertices <- vertices_my %>%filter(leaf) %>%mutate(leaf_id =row_number(),angle =90-360* (leaf_id -1) /n(),hjust =ifelse(angle <-90, 1, 0),angle =ifelse(angle <-90, angle +180, angle) )# Update vertices_my with leaf anglesvertices_my <- vertices_my %>%left_join(leaf_vertices %>%select(id, angle, hjust), by ="id")```### The circular dendrogramThis code plots a circular dendrogram from the hierarchical clustering results```{r}#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)# Basic dendrogram ----ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_edge_link(size =0.4, alpha =0.1) +geom_node_text(aes(x = x*1.01, y = y*1.01, filter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.5, alpha =1) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.2, 1.2), y =c(-1.2, 1.2))```### The hierarchical edge bundlingThis code plots a hierarchical edge bundling chart from the hierarchy defined in the dendrogram```{r}#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle h1 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), alpha =0.2, # Reduced alpha for better visibility# colour="#69b3a2", tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="RdPu") +geom_node_point(aes(filter = leaf), size =2, color ="black") +geom_node_text(aes(x = x*1.05, y=y*1.05, filter = leaf, label=shortName, angle = angle, hjust=hjust), size=2, alpha=1) +coord_fixed() +theme_void() +theme(legend.position="none",plot.margin=unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4))h1 ggsave(here::here("Outputs", "Second figures", "HEB_1G_females.pdf"), h1, device ="pdf", width =88, height =88, units ="mm")```### NbclustThe NbClust function is being used to determine the optimal number of clusters in our data. It applies multiple clustering validity indices and methods to suggest the best number of clusters, providing a consensus view based on various criteria to help make a more robust decision about the appropriate cluster count for our dataset.```{r, results='asis', fig.show='hide', message=FALSE, warning=FALSE}library(stringr)library(NbClust)methods <-c("ward.D2")for (m in methods) {# Capture all output output <-capture.output({ result <-NbClust(data = scaled_data, distance ="euclidean", min.nc =2, max.nc =10, method = m, index ="all") })# Join the output into a single string output_text <-paste(output, collapse ="\n")# Split the output by the asterisk separator split_output <-str_split(output_text, "\\*{10,}")[[1]]# Check if we have at least 3 partsif (length(split_output) >=3) {# Trim whitespace from the second part second_part <-str_trim(split_output[2])# Split the second part into lines lines <-str_split(second_part, "\n")[[1]]# Format the output formatted_output <-character(0)for (line in lines) {if (str_detect(line, "Conclusion")) { formatted_output <-c(formatted_output, "", "Conclusion", "") } elseif (str_detect(line, "^\\*")) { formatted_output <-c(formatted_output, line) } }# Join the formatted lines formatted_output <-paste(formatted_output, collapse ="\n")# Print the resultscat(paste("Results for method:", m, "\n\n"))cat(formatted_output)cat("\n\n") # Add some space between results for different methods } else {cat(paste("Unexpected output format for method:", m, "\n")) }}```### Plot the clusters ```{r}cluster_groups <-cutree(hc, k =3)# Add cluster information to vertices_myvertices_my <- vertices_my %>%mutate(cluster =factor(cluster_groups[match(name, names(cluster_groups))]))library(RColorBrewer)# Select three colors from the "Paired" palettecluster_colors <-brewer.pal(3, "Paired")#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh2 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="RdPu") +geom_node_point(aes(filter = leaf, color = cluster), # Move points slightly outwardsize =2, alpha =1) +geom_node_text(aes(x = x *1.05, y = y *1.05, # Move text even further outwardfilter = leaf, label = shortName, angle = angle, hjust = hjust), size =2, alpha =1) +scale_color_manual(values = cluster_colors) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0), "cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4)) # Further expanded limits to accommodate text and pointsh2 ggsave(here::here("Outputs", "Second figures", "HEB_1G_females_clusters.pdf"), h2, device ="pdf", width =88, height =88, units ="mm")```## Male data### Load data and create the dendrogramThis code loads the male data, standardizes the data to one unit variance across sampled groups, calculates euclidean distances, performs hierarchical clustering and creates a dendrogram.```{r}# Step 1: Load and standardize the datadata <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =2)data <-as.data.frame(data)# Ensure the first column becomes the row namesrownames(data) <- data[,1]data <- data[,-1]#Standardise the data scaled_data <- vegan::decostand(data, method="standardize", MARGIN =1)# Step 2: Calculate Euclidean distancesdist_matrix <-dist(scaled_data, method ="euclidean")# Step 3: Apply hierarchical clusteringhc <-hclust(dist_matrix, method ="ward.D2")# Step 4 : Convert to dendrogram dendro <-as.dendrogram(hc)```### Create dataframesThis code creates three dataframes necessary for plotting the charts: edges, vertices and connections```{r}#Step 5: Use ggraph hierarchy_graph =as_tbl_graph(dendro)#Hierarchal dataframe edge_df <- hierarchy_graph %>%activate(edges) %>%as_tibble() %>%rename(from = from, to = to)# Extract vertices dataframevertex_df <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(node_id =row_number())vertices_my <- vertex_df %>%mutate(id =row_number(),name = label,shortName = label, # Assuming you don't have a shorter namegroup =ifelse(leaf, "leaf", "internal")) %>%select(id, name, shortName, group, leaf, height, members) %>%arrange(name) %>%mutate(name =factor(name, levels =unique(name)))# Check for NA or empty names and replace them with a placeholdervertices_my <- vertices_my %>%mutate(shortName =ifelse(is.na(shortName) | shortName =="", paste0("Node", row_number()), shortName)) %>%mutate(name =ifelse(is.na(name) | name =="", paste0("Node_", row_number()), name))# Create connections_my dataframeconnections_my_df <- edge_df %>%left_join(vertex_df, by =c("from"="node_id")) %>%left_join(vertex_df, by =c("to"="node_id"), suffix =c("_from", "_to"))connections_my <- connections_my_df %>%select(from, to) %>%#This selects only the 'from' and 'to' columns from the original dataframe.mutate(from = vertices_my$name[from],to = vertices_my$name[to]) %>%#This replaces the values in the 'from' and 'to' columns with corresponding names from the vertices_my dataframe.distinct() #This removes any duplicate rows from the resulting dataframe.connections_my <- connections_my %>%filter(from !=""& to !="") #This keeps only the rows where from and to are not empty strings # Preparation to draw labels properly:# Extract vertex datavertices_my <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(id =row_number(),name = label,shortName = label, # Keep original labelsgroup =ifelse(leaf, "leaf", "internal"),unique_name =make.unique(as.character(name), sep ="_") ) %>%select(id, name, shortName, unique_name, group, leaf, height, members)# Correct any empty or NA names only for internal nodesvertices_my <- vertices_my %>%mutate(name =ifelse(!leaf & (is.na(name) | name ==""), paste0("Node_", row_number()), name),shortName =ifelse(!leaf & (is.na(shortName) | shortName ==""), paste0("Node", row_number()), shortName) )# Prepare label angles for leavesleaf_vertices <- vertices_my %>%filter(leaf) %>%mutate(leaf_id =row_number(),angle =90-360* (leaf_id -1) /n(),hjust =ifelse(angle <-90, 1, 0),angle =ifelse(angle <-90, angle +180, angle) )# Update vertices_my with leaf anglesvertices_my <- vertices_my %>%left_join(leaf_vertices %>%select(id, angle, hjust), by ="id")```### The circular dendrogramThis code plots a circular dendrogram from the hierarchical clustering results```{r}#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)# Basic dendrogram ----ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_edge_link(size =0.4, alpha =0.1) +geom_node_text(aes(x = x*1.01, y = y*1.01, filter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.8, alpha =1) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.2, 1.2), y =c(-1.2, 1.2))```### The hierarchical edge bundlingThis code plots a hierarchical edge bundling chart from the hierarchy defined in the dendrogram```{r}#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle h3 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), alpha =0.2, # Reduced alpha for better visibility# colour="#69b3a2", tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="BuPu") +geom_node_point(aes(filter = leaf), size =2, color ="black") +geom_node_text(aes(x = x*1.05, y=y*1.05, filter = leaf, label=shortName, angle = angle, hjust=hjust), size=2.2, alpha=1) +coord_fixed() +theme_void() +theme(legend.position="none",plot.margin=unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4))h3 ggsave(here::here("Outputs", "Second figures", "HEB_1G_males.pdf"), h3, device ="pdf", width =88, height =88, units ="mm")```### NbclustThe NbClust function is being used to determine the optimal number of clusters in our data. It applies multiple clustering validity indices and methods to suggest the best number of clusters, providing a consensus view based on various criteria to help make a more robust decision about the appropriate cluster count for our dataset.```{r, results='asis', fig.show='hide', message=FALSE, warning=FALSE}library(stringr)library(NbClust)methods <-c("ward.D2")for (m in methods) {# Capture all output output <-capture.output({ result <-NbClust(data = scaled_data, distance ="euclidean", min.nc =2, max.nc =10, method = m, index ="all") })# Join the output into a single string output_text <-paste(output, collapse ="\n")# Split the output by the asterisk separator split_output <-str_split(output_text, "\\*{10,}")[[1]]# Check if we have at least 3 partsif (length(split_output) >=3) {# Trim whitespace from the second part second_part <-str_trim(split_output[2])# Split the second part into lines lines <-str_split(second_part, "\n")[[1]]# Format the output formatted_output <-character(0)for (line in lines) {if (str_detect(line, "Conclusion")) { formatted_output <-c(formatted_output, "", "Conclusion", "") } elseif (str_detect(line, "^\\*")) { formatted_output <-c(formatted_output, line) } }# Join the formatted lines formatted_output <-paste(formatted_output, collapse ="\n")# Print the resultscat(paste("Results for method:", m, "\n\n"))cat(formatted_output)cat("\n\n") # Add some space between results for different methods } else {cat(paste("Unexpected output format for method:", m, "\n")) }}```### Plot the clusters ```{r}cluster_groups <-cutree(hc, k =2)# Add cluster information to vertices_myvertices_my <- vertices_my %>%mutate(cluster =factor(cluster_groups[match(name, names(cluster_groups))]))library(RColorBrewer)# Select three colors from the "Paired" palettecluster_colors <-brewer.pal(2, "Paired")#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh4 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="BuPu") +geom_node_point(aes(filter = leaf, color = cluster, x = x, y = y), # Move points slightly outwardsize =2, alpha =1) +geom_node_text(aes(x = x *1.05, y = y *1.05, # Move text even further outwardfilter = leaf, label = shortName, angle = angle, hjust = hjust), size =2.2, alpha =1) +scale_color_manual(values = cluster_colors) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0), "cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4)) # Further expanded limits to accommodate text and pointsh4 ggsave(here::here("Outputs", "Second figures", "HEB_1G_males_clusters.pdf"), h4, device ="pdf", width =88, height =88, units ="mm")```# Molecular analytes and cardiometabolic health markers ## Female data### Load data and create the dendrogramThis code loads the female data, standardizes the data to one unit variance across sampled groups, calculates euclidean distances, performs hierarchical clustering and creates a dendrogram.```{r}# Step 1: Load and standardize the datadata <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =3)data <-as.data.frame(data)# Ensure the first column becomes the row namesrownames(data) <- data[,1]data <- data[,-1]#Standardise the data scaled_data <- vegan::decostand(data, method="standardize", MARGIN =1)# Step 2: Calculate Euclidean distancesdist_matrix <-dist(scaled_data, method ="euclidean")# Step 3: Apply hierarchical clusteringhc <-hclust(dist_matrix, method ="ward.D2")# Step 4 : Convert to dendrogram dendro <-as.dendrogram(hc)```### Create dataframesThis code creates three dataframes necessary for plotting the charts: edges, vertices and connections```{r}#Step 5: Use ggraph hierarchy_graph =as_tbl_graph(dendro)#Hierarchal dataframe edge_df <- hierarchy_graph %>%activate(edges) %>%as_tibble() %>%rename(from = from, to = to)# Extract vertices dataframevertex_df <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(node_id =row_number())vertices_my <- vertex_df %>%mutate(id =row_number(),name = label,shortName = label, # Assuming you don't have a shorter namegroup =ifelse(leaf, "leaf", "internal")) %>%select(id, name, shortName, group, leaf, height, members) %>%arrange(name) %>%mutate(name =factor(name, levels =unique(name)))# Check for NA or empty names and replace them with a placeholdervertices_my <- vertices_my %>%mutate(shortName =ifelse(is.na(shortName) | shortName =="", paste0("Node", row_number()), shortName)) %>%mutate(name =ifelse(is.na(name) | name =="", paste0("Node_", row_number()), name))# Create connections_my dataframeconnections_my_df <- edge_df %>%left_join(vertex_df, by =c("from"="node_id")) %>%left_join(vertex_df, by =c("to"="node_id"), suffix =c("_from", "_to"))connections_my <- connections_my_df %>%select(from, to) %>%#This selects only the 'from' and 'to' columns from the original dataframe.mutate(from = vertices_my$name[from],to = vertices_my$name[to]) %>%#This replaces the values in the 'from' and 'to' columns with corresponding names from the vertices_my dataframe.distinct() #This removes any duplicate rows from the resulting dataframe.connections_my <- connections_my %>%filter(from !=""& to !="") #This keeps only the rows where from and to are not empty strings # Preparation to draw labels properly:# Extract vertex datavertices_my <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(id =row_number(),name = label,shortName = label, # Keep original labelsgroup =ifelse(leaf, "leaf", "internal"),unique_name =make.unique(as.character(name), sep ="_") ) %>%select(id, name, shortName, unique_name, group, leaf, height, members)# Correct any empty or NA names only for internal nodesvertices_my <- vertices_my %>%mutate(name =ifelse(!leaf & (is.na(name) | name ==""), paste0("Node_", row_number()), name),shortName =ifelse(!leaf & (is.na(shortName) | shortName ==""), paste0("Node", row_number()), shortName) )# Prepare label angles for leavesleaf_vertices <- vertices_my %>%filter(leaf) %>%mutate(leaf_id =row_number(),angle =90-360* (leaf_id -1) /n(),hjust =ifelse(angle <-90, 1, 0),angle =ifelse(angle <-90, angle +180, angle) )# Update vertices_my with leaf anglesvertices_my <- vertices_my %>%left_join(leaf_vertices %>%select(id, angle, hjust), by ="id")```### The circular dendrogramThis code plots a circular dendrogram from the hierarchical clustering results```{r}#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)# Basic dendrogram ----ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_edge_link(size =0.4, alpha =0.1) +geom_node_text(aes(x = x*1.01, y = y*1.01, filter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.5, alpha =1) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.2, 1.2), y =c(-1.2, 1.2))```### The hierarchical edge bundlingThis code plots a hierarchical edge bundling chart from the hierarchy defined in the dendrogram```{r}#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle h5 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), alpha =0.2, # Reduced alpha for better visibility# colour="#69b3a2", tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="RdPu") +geom_node_point(aes(filter = leaf), size =2, color ="black") +geom_node_text(aes(x = x*1.05, y=y*1.05, filter = leaf, label=shortName, angle = angle, hjust=hjust), size=2.2, alpha=1) +coord_fixed() +theme_void() +theme(legend.position="none",plot.margin=unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4))h5ggsave(here::here("Outputs", "Second figures", "HEB_2G_females.pdf"), h5, device ="pdf", width =88, height =88, units ="mm")```### With cardiometabolic biomarkersThis code plots a hierarchical edge bundling chart from the hierarchy defined in the dendrogram highlighting which markers are molecular analytes and which are cardiometabolic health markers```{r}# Step 1: Load and standardize the datamarkers <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =5)markers_females=markers[,c(2:3)]library(dplyr)vertices_my <- vertices_my %>%left_join(markers_females, by =c("name"="Females")) %>%rename(Markers = Type) %>%mutate(Markers =factor(Markers))# Define colors for the two groupsmarker_levels <-levels(vertices_my$Markers)group_colors <-setNames(c("black", "#D2B48C"), marker_levels)mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh6 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour =after_stat(index)),show.legend =FALSE) +# Hide edge color legendscale_edge_colour_distiller(palette ="RdPu") +geom_node_point(aes(filter = leaf, color = Markers # Use 'group' for coloring ),size =2, alpha =1) +geom_node_text(aes(x = x *1.05, y = y *1.05,filter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.8, alpha =1) +scale_color_manual(values = group_colors, name =NULL) +# Name the legendcoord_fixed() +theme_void() +theme(legend.position ="bottom",legend.text =element_text(size =6),plot.margin =unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4))h6 ggsave(here::here("Outputs", "Second figures", "HEB_2G_females_legend.pdf"), h6, device ="pdf", width =88, height =88, units ="mm")```### NbclustThe NbClust function is being used to determine the optimal number of clusters in our data. It applies multiple clustering validity indices and methods to suggest the best number of clusters, providing a consensus view based on various criteria to help make a more robust decision about the appropriate cluster count for our dataset.```{r, results='asis', fig.show='hide', message=FALSE, warning=FALSE}library(stringr)library(NbClust)methods <-c("ward.D2")for (m in methods) {# Capture all output output <-capture.output({ result <-NbClust(data = scaled_data, distance ="euclidean", min.nc =2, max.nc =10, method = m, index ="all") })# Join the output into a single string output_text <-paste(output, collapse ="\n")# Split the output by the asterisk separator split_output <-str_split(output_text, "\\*{10,}")[[1]]# Check if we have at least 3 partsif (length(split_output) >=3) {# Trim whitespace from the second part second_part <-str_trim(split_output[2])# Split the second part into lines lines <-str_split(second_part, "\n")[[1]]# Format the output formatted_output <-character(0)for (line in lines) {if (str_detect(line, "Conclusion")) { formatted_output <-c(formatted_output, "", "Conclusion", "") } elseif (str_detect(line, "^\\*")) { formatted_output <-c(formatted_output, line) } }# Join the formatted lines formatted_output <-paste(formatted_output, collapse ="\n")# Print the resultscat(paste("Results for method:", m, "\n\n"))cat(formatted_output)cat("\n\n") # Add some space between results for different methods } else {cat(paste("Unexpected output format for method:", m, "\n")) }}```### Plot the clusters ```{r}cluster_groups <-cutree(hc, k =3)# Add cluster information to vertices_myvertices_my <- vertices_my %>%mutate(cluster =factor(cluster_groups[match(name, names(cluster_groups))]))library(RColorBrewer)# Select three colors from the "Paired" palettecluster_colors <-brewer.pal(3, "Paired")#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh5b =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="RdPu") +geom_node_point(aes(filter = leaf, color = cluster, x = x *1.07, y = y *1.07), # Move points slightly outwardsize =3, alpha =0.6) +geom_node_text(aes(x = x *1.14, y = y *1.14, # Move text even further outwardfilter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.5, alpha =1) +scale_color_manual(values = cluster_colors) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0), "cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4)) # Further expanded limits to accommodate text and pointsh5bggsave(here::here("Outputs", "Second figures", "HEB_2G_females_clusters.pdf"), h5b, device ="pdf", width =88, height =88, units ="mm")```## Male data### Load data and create the dendrogramThis code loads the male data, standardizes the data to one unit variance across sampled groups, calculates euclidean distances, performs hierarchical clustering and creates a dendrogram.```{r}# Step 1: Load and standardize the datadata <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =4)data <-as.data.frame(data)# Ensure the first column becomes the row namesrownames(data) <- data[,1]data <- data[,-1]#Standardise the data scaled_data <- vegan::decostand(data, method="standardize", MARGIN =1)# Step 2: Calculate Euclidean distancesdist_matrix <-dist(scaled_data, method ="euclidean")# Step 3: Apply hierarchical clusteringhc <-hclust(dist_matrix, method ="ward.D2")# Step 4 : Convert to dendrogram dendro <-as.dendrogram(hc)```### Create dataframesThis code creates three dataframes necessary for plotting the charts: edges, vertices and connections```{r}#Step 5: Use ggraph hierarchy_graph =as_tbl_graph(dendro)#Hierarchal dataframe edge_df <- hierarchy_graph %>%activate(edges) %>%as_tibble() %>%rename(from = from, to = to)# Extract vertices dataframevertex_df <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(node_id =row_number())vertices_my <- vertex_df %>%mutate(id =row_number(),name = label,shortName = label, # Assuming you don't have a shorter namegroup =ifelse(leaf, "leaf", "internal")) %>%select(id, name, shortName, group, leaf, height, members) %>%arrange(name) %>%mutate(name =factor(name, levels =unique(name)))# Check for NA or empty names and replace them with a placeholdervertices_my <- vertices_my %>%mutate(shortName =ifelse(is.na(shortName) | shortName =="", paste0("Node", row_number()), shortName)) %>%mutate(name =ifelse(is.na(name) | name =="", paste0("Node_", row_number()), name))# Create connections_my dataframeconnections_my_df <- edge_df %>%left_join(vertex_df, by =c("from"="node_id")) %>%left_join(vertex_df, by =c("to"="node_id"), suffix =c("_from", "_to"))connections_my <- connections_my_df %>%select(from, to) %>%#This selects only the 'from' and 'to' columns from the original dataframe.mutate(from = vertices_my$name[from],to = vertices_my$name[to]) %>%#This replaces the values in the 'from' and 'to' columns with corresponding names from the vertices_my dataframe.distinct() #This removes any duplicate rows from the resulting dataframe.connections_my <- connections_my %>%filter(from !=""& to !="") #This keeps only the rows where from and to are not empty strings # Preparation to draw labels properly:# Extract vertex datavertices_my <- hierarchy_graph %>%activate(nodes) %>%as_tibble() %>%mutate(id =row_number(),name = label,shortName = label, # Keep original labelsgroup =ifelse(leaf, "leaf", "internal"),unique_name =make.unique(as.character(name), sep ="_") ) %>%select(id, name, shortName, unique_name, group, leaf, height, members)# Correct any empty or NA names only for internal nodesvertices_my <- vertices_my %>%mutate(name =ifelse(!leaf & (is.na(name) | name ==""), paste0("Node_", row_number()), name),shortName =ifelse(!leaf & (is.na(shortName) | shortName ==""), paste0("Node", row_number()), shortName) )# Prepare label angles for leavesleaf_vertices <- vertices_my %>%filter(leaf) %>%mutate(leaf_id =row_number(),angle =90-360* (leaf_id -1) /n(),hjust =ifelse(angle <-90, 1, 0),angle =ifelse(angle <-90, angle +180, angle) )# Update vertices_my with leaf anglesvertices_my <- vertices_my %>%left_join(leaf_vertices %>%select(id, angle, hjust), by ="id")```### The circular dendrogramThis code plots a circular dendrogram from the hierarchical clustering results```{r}#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)# Basic dendrogram ----ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_edge_link(size =0.4, alpha =0.1) +geom_node_text(aes(x = x*1.01, y = y*1.01, filter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.5, alpha =1) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.2, 1.2), y =c(-1.2, 1.2))```### The hierarchical edge bundlingThis code plots a hierarchical edge bundling chart from the hierarchy defined in the dendrogram```{r}#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle h7 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), alpha =0.2, # Reduced alpha for better visibility# colour="#69b3a2", tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="BuPu") +geom_node_point(aes(filter = leaf), size =2, color ="black") +geom_node_text(aes(x = x*1.05, y=y*1.05, filter = leaf, label=shortName, angle = angle, hjust=hjust), size=2, alpha=1) +coord_fixed() +theme_void() +theme(legend.position="none",plot.margin=unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4))h7ggsave(here::here("Outputs", "Second figures", "HEB_2G_males.pdf"), h7, device ="pdf", width =88, height =88, units ="mm")```### With cardiometabolic biomarkersThis code plots a hierarchical edge bundling chart from the hierarchy defined in the dendrogram highlighting which markers are molecular analytes and which are cardiometabolic health markers```{r}# Step 1: Load and standardize the datamarkers <-read_excel(here::here("Data/Mol_raw_data.xlsx"), sheet =5)markers_males=markers[,c(1:3)]library(dplyr)vertices_my <- vertices_my %>%left_join(markers_males, by =c("name"="Males")) %>%rename(Markers = Type) %>%mutate(Markers =factor(Markers))# Define colors for the two groupsmarker_levels <-levels(vertices_my$Markers)group_colors <-setNames(c("black", "#D2B48C"), marker_levels)mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh8 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour =after_stat(index)),show.legend =FALSE) +# Hide edge color legendscale_edge_colour_distiller(palette ="BuPu") +geom_node_point(aes(filter = leaf, color = Markers # Use 'group' for coloring ),size =2, alpha =1) +geom_node_text(aes(x = x *1.05, y = y *1.05,filter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.8, alpha =1) +scale_color_manual(values = group_colors, name =NULL) +# Name the legendcoord_fixed() +theme_void() +theme(legend.position ="bottom",legend.text =element_text(size =6),plot.margin =unit(c(0,0,0,0),"cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4))h8ggsave(here::here("Outputs", "Second figures", "HEB_2G_males_legend.pdf"), h8, device ="pdf", width =88, height =88, units ="mm")```### NbclustThe NbClust function is being used to determine the optimal number of clusters in our data. It applies multiple clustering validity indices and methods to suggest the best number of clusters, providing a consensus view based on various criteria to help make a more robust decision about the appropriate cluster count for our dataset.```{r, results='asis', fig.show='hide', message=FALSE, warning=FALSE}library(NbClust)library(dplyr)library(purrr)all_indices <-c("kl", "ch", "hartigan", "cindex", "db", "silhouette", "duda", "pseudot2", "beale", "ratkowsky", "ball", "ptbiserial", "gap", "mcclain", "gamma", "gplus", "tau", "dunn", "hubert", "sdindex", "dindex", "sdbw")run_nbclust <-function(index) {tryCatch({# Suppress output result <-capture.output({ nb_result <-NbClust(data = scaled_data, distance ="euclidean", min.nc =2, max.nc =10, method ="ward.D2", index = index) })return(list(index = index, result = nb_result, error =NULL)) }, error =function(e) {return(list(index = index, result =NULL, error =conditionMessage(e))) })}results <-map(all_indices, run_nbclust)# Extract working indices and their resultsworking_results <- results %>%keep(~is.null(.$error)) %>%map(~ .$result)# Combine resultsif (length(working_results) >0) { combined_result <-list(All.index =do.call(c, map(working_results, ~ .$All.index)),Best.nc =do.call(rbind, map(working_results, ~ .$Best.nc)),Best.partition = working_results[[1]]$Best.partition )# Count the proposed best number of clusters best_nc_counts <-table(combined_result$Best.nc[, 1])# Print results in the requested formatcat("Results for method: ward.D2\n\n")cat("Among all indices:\n\n")for (nc insort(as.numeric(names(best_nc_counts)))) { count <- best_nc_counts[as.character(nc)]cat(sprintf("* %d proposed %d as the best number of clusters\n", count, nc)) }cat("\nConclusion\n\n") best_nc <-as.numeric(names(which.max(best_nc_counts)))cat(sprintf("According to the majority rule, the best number of clusters is %d\n", best_nc))} else {cat("No indices worked successfully.\n")}```### Plot the clusters ```{r}cluster_groups <-cutree(hc, k =10)# Add cluster information to vertices_myvertices_my <- vertices_my %>%mutate(cluster =factor(cluster_groups[match(name, names(cluster_groups))]))library(RColorBrewer)# Select three colors from the "Paired" palettecluster_colors <-brewer.pal(10, "Paired")#Create the graph object mygraph <-graph_from_data_frame(edge_df, vertices = vertices_my)#Correct the connections_my dataframe with valid connections # Get leaf nodesleaf_nodes <-V(mygraph)$name[V(mygraph)$leaf]# Create all possible combinations of leaf nodesall_combinations <-expand.grid(from = leaf_nodes, to = leaf_nodes) %>%mutate(from =as.character(from),to =as.character(to) )# Remove self-connections and duplicate connectionsconnections_my_corrected <- all_combinations %>%filter(from != to) %>%# Remove self-connectionsmutate(pair =pmin(from, to),pair_to =pmax(from, to) ) %>%distinct(pair, pair_to) %>%# Remove duplicatesselect(from = pair, to = pair_to)# Convert node names to indicesfrom <-match(connections_my_corrected$from, V(mygraph)$name)to <-match(connections_my_corrected$to, V(mygraph)$name)# Plot the hierarchical edge bundle with colored pointsh9 =ggraph(mygraph, layout ='dendrogram', circular =TRUE) +geom_conn_bundle(data =get_con(from = from, to = to), tension =1.5,aes(colour=after_stat(index))) +scale_edge_colour_distiller(palette ="BuPu") +geom_node_point(aes(filter = leaf, color = cluster, x = x *1.07, y = y *1.07), # Move points slightly outwardsize =3, alpha =0.6) +geom_node_text(aes(x = x *1.14, y = y *1.14, # Move text even further outwardfilter = leaf, label = shortName, angle = angle, hjust = hjust), size =1.6, alpha =1) +scale_color_manual(values = cluster_colors) +coord_fixed() +theme_void() +theme(legend.position ="none",plot.margin =unit(c(0,0,0,0), "cm"), ) +expand_limits(x =c(-1.4, 1.4), y =c(-1.4, 1.4)) # Further expanded limits to accommodate text and pointsh9ggsave(here::here("Outputs", "Second figures", "HEB_2G_males_clusters.pdf"), h9, device ="pdf", width =88, height =88, units ="mm")```